1. Clearing the workspace
2. Loading the Required Packages
library(tidyverse)
library(plotly)
library(vcfR)
library(adegenet)
library(hierfstat)
library(pegas)
library(poppr)
library(boot)
3. Loading the Data
fst_windowed = read.table(file = "/Users/vicegill/Documents/Project_2022/RUS_EUR.windowed.weir.fst", sep = "\t",header = TRUE)
4. Plotting the Fst according to the position on the chromosome
fst_windowed$index=1:nrow(fst_windowed)
threshold_for_fst<- quantile(fst_windowed$WEIGHTED_FST, c(0.975,0.995,0.9999),na.rm=TRUE)
fst_windowed <- fst_windowed %>% mutate(outlier = ifelse(fst_windowed$WEIGHTED_FST > threshold_for_fst[2], "outlier", "background"))
tallied_data <- fst_windowed %>% group_by(outlier) %>% tally()
weak <- fst_windowed[fst_windowed$WEIGHTED_FST > threshold_for_fst[1],]
medium <- fst_windowed[fst_windowed$WEIGHTED_FST > threshold_for_fst[2],]
high <- fst_windowed[fst_windowed$WEIGHTED_FST > threshold_for_fst[3],]
5. Filtering Chromosome 5
As it has the highest Fst Value and the table of very High Fst
Differentiated SNP’s
fst_windowed_5 = fst_windowed %>% filter(CHROM==5)
fst_windowed_5$index=1:nrow(fst_windowed_5)
fst_plot = fst_windowed_5 %>% ggplot(aes(x=index,y=WEIGHTED_FST,color=outlier)) +
geom_point(alpha=3/4) +
xlab("Index Position") +
ylab("Weighted Fst Value") +
ggtitle(label = "Fst Value in the Chromosome 5",subtitle = "Project 2023")
fst_plot <- ggplotly(fst_plot)
fst_pot <- layout(fst_plot, height = 200, width = 300)
fst_plot
high_5 = high <- fst_windowed_5[fst_windowed_5$WEIGHTED_FST > threshold_for_fst[3],]
print(high_5)
## CHROM BIN_START BIN_END N_VARIANTS WEIGHTED_FST MEAN_FST index outlier
## 726 5 36750001 36800000 2 0.444164 0.440951 726 outlier
## 734 5 37150001 37200000 2 0.320195 0.318490 734 outlier
6. Zooming in the Region of Maximum Differentiation
Filtering Chromosome 5 as it has the highest Fst Value.High
Differention index is 726 and 754 so we can select +/- 50 sites within
this interval (676-804)
fst_windowed_5 = fst_windowed %>% filter(CHROM==5)
fst_windowed_5$index=1:nrow(fst_windowed_5)
7. Confidence interval by bootstraping
First Graph include the Quantile Line obtained from the quantile
func showing the fst value which is higher and lower of 95 per of
SNP’s
fst_value <- fst_windowed$WEIGHTED_FST
quantile_fst = quantile(fst_value,c(0.975,0.025),na.rm = TRUE)
plot(density(fst_value),main="Density Plot of the Fst Values")
abline(v= 0.03573623)
abline(v= -0.02125245)

calculate_fst <- function(data, indices) {
# Extract the Fst values using the indices
sampled_fst <- data[indices]
# Calculate the desired statistic, such as mean or median
result <- median(sampled_fst) # Adjust as needed
return(result)
}
# Set the number of bootstrap samples
n_boot <- 10000 # Adjust as desired
# Run the bootstrap procedure
boot_result <- boot(fst_value, calculate_fst, R = n_boot)
# Calculate the confidence interval
ci <- boot.ci(boot_result, type = "basic") # Adjust the type as desired
# Extract the confidence interval bounds
ci_lower <- ci$basic[4]
ci_upper <- ci$basic[5]
# Print the confidence interval
cat("Confidence Interval: [", ci_lower, ", ", ci_upper, "]\n")
## Confidence Interval: [ 0.00167693 , 0.00198754 ]
8. Plotting fst values into the region +/- 50 of Highly
Differentiated SNP’s on chromosome 5
fst_plot = fst_windowed_5 %>% filter(index>=676 &index<= 804) %>%
ggplot(aes(x=index,y=WEIGHTED_FST,color=outlier,shape=(WEIGHTED_FST > ci_upper))) +
geom_point() +
xlab("Index Position") +
ylab("Weighted Fst Value") +
ggtitle(label = "Fst Value in the Chromosome 5",subtitle = "Project 2023") +
scale_shape_manual(name="Shape Legend",values = c(1,3),labels=c("Background by BS","Outlier by BS")) + scale_color_manual(name="Color Legend", values = c("#E76F51","#2A9D8F"),label=c("background by Q","Outlier by Q"))+ geom_vline(xintercept = 716, color = "black", linetype = "dashed", size =0.5) +
geom_vline(xintercept = 748, color = "black", linetype = "dashed", size =0.5)
fst_plot

print(paste("Lines Showing the Selected Region"))
## [1] "Lines Showing the Selected Region"
9. Finding the chromosome range where we should do the local
PCA.
fst_windowed_5 = fst_windowed_5 %>% mutate(mean_pos=(BIN_START+BIN_END)/2)
chrom_pos_lower=as.integer(fst_windowed_5[fst_windowed_5$index==716,]$mean_pos)
chrom_pos_upper=as.integer(fst_windowed_5[fst_windowed_5$index==748,]$mean_pos)
total_snps = fst_windowed_5 %>% filter(index >= 716 & index <= 748)
total_snps=sum(total_snps$N_VARIANTS)
print(paste("Chromosome Position Range ", chrom_pos_lower , "- ", chrom_pos_upper))
## [1] "Chromosome Position Range 35925000 - 37875000"
print(paste("Total number of variants within this range is ",total_snps))
## [1] "Total number of variants within this range is 84"